Rationale: this is basic QC for all samples of pure culture mycobiont and lichen. We will use to select which sample we canuse and comparisons

Create tpm table from kallisto output

counts<-read.delim2("../analysis_and_temp_files/06_meta_mapping/kallisto_mycobiont_only.txt",sep=" ")
metadata<-read.csv2("../analysis_and_temp_files/06_meta_mapping/metadata_shared_with_neha.csv",sep=",")
length(unique(counts$sample)) #this should be 31 as in the metadat.csv file
## [1] 31
counts$tpm<-round(as.numeric(counts$tpm),digits = 3) #round off tpm upto 3 decimals
counts_tab<-pivot_wider(counts[c(1,5,6)],names_from=sample,values_from = tpm) #widen the table

1. All Samples (Xanthoria and Lichen)

Calculate and plot pearson’s correlation coefficients between all the samples

all_counts<- counts_tab[rowSums(counts_tab[c(2:32)])>0,] #remove rows which have 0 in all samples
all<-cor(all_counts[c(2:32)], method="pearson",use="pairwise.complete.obs")
ord_all<-c("XBA1","XBA2","XSA1","XSA2","XSA2_2","XTA1","XTA2","XBC1","XBC2","XMC2","XSC1",
       "XSC2","XTC2","XBE1","XBE2","XSE2","XTE2","MP_I","MP_II","KS48XB1","KS48XB2","KS48XB3",
       "KS9XB1","KS9XB2","KS9XB3","S_21XB1","KS21XB1","S_21XB3","S_42XB1","S_42XB2","S_42XB3") #arrange order according to biological replicates (optional)
all1 <- all[, ord_all]
all1<-all1[ord_all, ]

# For correlation plot
corrplot(all1,method="color",type="full",is.corr=F, col=col2(200),addgrid.col = 0,tl.cex = 1, tl.col = "black")

# for PCA
plotMDS(all_counts[c(2:32)],main="All samples") 

# For heatmap
heat1<-column_to_rownames(all_counts, var = "target_id")
exp_matrix<-data.matrix(heat1)
mycols = colorRampPalette(c("blue","white","red"))(100) 
hc_rows <- hclust(as.dist(1-cor(t(exp_matrix), method="pearson")), method="average") # for clustering genes
hc_cols <- hclust(as.dist(1 - cor(exp_matrix, method = "pearson")), method = "average") # to arrange the sample names at x-axis according to hierarchical clustering
htmap<-heatmap.2(exp_matrix, Colv=as.dendrogram(hc_cols),Rowv=as.dendrogram(hc_rows),dendrogram = "both",trace = "none", 
                 col = mycols, keysize = 1, key.title = "Title",
                 scale = c("row"), cexRow = 0.75, cexCol = 1, srtCol = NULL,  
                 srtRow = NULL, adjRow = c(0,NA), sepcolor="white" ,margins = c(7,15), 
                 lwid = c(2,8),
                 lhei= c(2,8), main ="All samples", na.color="black") #this is the final one for the heat map

Conclusions:

  • Lichen and culture samples broadly form two groups, but with two big buts:
  • 2dpi cultures differ dramatically from the rest
  • MP_I and MP_II lichen samples (“filter”) group with the cultures (in the cloud of >2dpi). Perhaps, they were incubated for too long in the lab and weren’t happy?

2. Lichen Samples (excluding MP_I and MP_II)

Calculate and plot pearson’s correlation coefficients between Lichen samples

lich_counts<- counts_tab[rowSums(counts_tab[c(2:14,16:19)])>0,] #remove rows which have 0 in all samples
lich<-cor(lich_counts[c(2:14,16:19)], method="pearson",use="pairwise.complete.obs")
lich<-cor(counts_tab[c(2:14,16:19)], method="pearson",use="pairwise.complete.obs")
ord_lich<-c("XBA1","XBA2","XSA1","XSA2","XSA2_2","XTA1","XTA2","XBC1","XBC2","XMC2","XSC1",
           "XSC2","XTC2","XBE1","XBE2","XSE2","XTE2")
lich1 <- lich[, ord_lich]
lich1<-lich1[ord_lich, ]

# For correlation plot
corrplot(lich1,method="color",type="full",
         main="", is.corr=F, col=col2(200),addgrid.col = 0,tl.cex = 1, tl.col = "black")

# For multidimensional scaling
plotMDS(lich_counts[c(2:14,16:19)],main="Lichen samples") 

# For heatmap
heat<-lich_counts[c(1:14,16:19)]
heat1<-column_to_rownames(heat, var = "target_id")
exp_matrix<-data.matrix(heat1)
mycols = colorRampPalette(c("blue","white","red"))(100) 
hc_rows <- hclust(as.dist(1-cor(t(exp_matrix), method="pearson")), method="average") # for clustering genes
hc_cols <- hclust(as.dist(1 - cor(exp_matrix, method = "pearson")), method = "average") # to arrange the sample names at x-axis according to hierarchical clustering
htmap<-heatmap.2(exp_matrix, Colv=as.dendrogram(hc_cols),Rowv=as.dendrogram(hc_rows),dendrogram = "both",trace = "none", 
                 col = mycols, keysize = 1, key.title = "Title",
                 scale = c("row"), cexRow = 0.75, cexCol = 1, srtCol = NULL,  
                 srtRow = NULL, adjRow = c(0,NA), sepcolor="white" ,margins = c(7,15), 
                 lwid = c(2,8),
                 lhei= c(2,8), main ="lich samples", na.color="black") #this is the final one for the heat map

Conclusions:

  • Samples group based on substrate (PC1), with barks and twigs grouping together
  • Some grouping based on the thallus part is present on PC2, where apothecia are mostly in the upper part of the plot

3. Culture Samples

Calculate and plot pearson’s correlation coefficients between Xanthoria culture samples

xan_counts<- counts_tab[rowSums(counts_tab[c(21:32)])>0,] #remove rows which have 0 in xan samples
xan<-cor(xan_counts[c(21:32)], method="pearson",use="pairwise.complete.obs")
ord_xan<-c("KS48XB1","KS48XB2","KS48XB3",
           "KS9XB1","KS9XB2","KS9XB3","S_21XB1","KS21XB1","S_21XB3","S_42XB1","S_42XB2","S_42XB3") #arrange order according to biological replicates (optional)
xan1 <- xan[, ord_xan]
xan1<-xan1[ord_xan, ]
corrplot(xan1,method="color",type="full",is.corr=F, col=col2(200),addgrid.col = 0,tl.cex = 1, tl.col = "black")

# For multidimensional scaling
plotMDS(xan_counts[c(21:32)],main="Xanthoria samples") 

# For heatmap
heat<-xan_counts[c(1,21:32)]
heat1<-column_to_rownames(heat, var = "target_id")
exp_matrix<-data.matrix(heat1)
mycols = colorRampPalette(c("blue","white","red"))(100) 
hc_rows <- hclust(as.dist(1-cor(t(exp_matrix), method="pearson")), method="average") # for clustering genes
hc_cols <- hclust(as.dist(1 - cor(exp_matrix, method = "pearson")), method = "average") # to arrange the sample names at x-axis according to hierarchical clustering
htmap<-heatmap.2(exp_matrix, Colv=as.dendrogram(hc_cols),Rowv=as.dendrogram(hc_rows),dendrogram = "both",trace = "none",  
                 col = mycols, keysize = 1, key.title = "Title",
                 scale = c("row"), cexRow = 0.75, cexCol = 1, srtCol = NULL,  
                 srtRow = NULL, adjRow = c(0,NA), sepcolor="white" ,margins = c(7,15), 
                 lwid = c(2,8),
                 lhei= c(2,8), main ="Xanthoria samples", na.color="black") #this is the final one for the heat map

Conclusions:

  • The 2 dpi samples remain well isolated from the rest
  • Samples mostly group together based on the time point, with one exception
  • KS21XB1 is a bit different from the rest 21 dpi samples

4. ANI between the mycobiont transcriptome and the reference genome

between RNA data and our genome

  • Assemble transcriptomes (Trinity-v2.14.0) of one culture and one lichen samples and calculate ANIs between them and the predicted transcriptome from the genome
mkdir analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/
source package /tsl/software/testing/bin/trinity-2.14.0

Trinity  --seqType fq --max_memory 40G --left ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/KS48XB1_trimmed.bbmapmerged.non_rRNA.1.fq.gz  --right ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/KS48XB1_trimmed.bbmapmerged.non_rRNA.2.fq.gz --CPU 20 --output analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_trinity

Trinity  --seqType fq --max_memory 40G --left ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/XBE1_trimmed.bbmapmerged.non_rRNA.1.fq.gz  --right ../03_transcriptomic_analysis/analysis_and_temp_files/03_qc/trimmed_reads/XBE1_trimmed.bbmapmerged.non_rRNA.2.fq.gz --CPU 20 --output analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_trinity
  • Selected only those transcripts that map on to the reference genome
#align with BLAT
source package /tgac/software/testing/bin/blat-37x1 
blat analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -out=psl -noHead analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_blat.psl

#get list of transcripts that map by selecting the column with cut (48780 lines)
cut -f14 analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_blat.psl > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.txt

#remove duplicates with awk (21219 lines)
awk -i inplace '!seen[$0]++' analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.txt

#extract fastas with seqtk v1.4-r130
source package d6c2932f-b01d-4039-8736-a2e15d4867e2 
seqtk subseq analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.txt > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.fasta
  • Compared filtered transcripts to the predicted transcriptome with FastANI: 99.2197% identical
fastANI -r analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.fasta -q analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -o analysis_and_temp_files/06_annotate_lecanoro/GTX0501_XBE1_ani.txt
  • Treat the same culture sample
#align with BLAT
source package /tgac/software/testing/bin/blat-37x1 
blat analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -out=psl  analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_blat.psl

#get list of transcripts that map by selecting the column with cut (48780 lines)
cut -f14 analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_blat.psl > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.txt

#remove duplicates with awk (21219 lines)
awk -i inplace '!seen[$0]++' analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.txt

#extract fastas with seqtk v1.4-r130
source package d6c2932f-b01d-4039-8736-a2e15d4867e2 
seqtk subseq analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_trinity.Trinity.fasta analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.txt > analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.fasta

fastANI -r analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.fasta -q analysis_and_temp_files/06_annotate_lecanoro/GTX0501_pred/annotate_results/Xanthoria_parietina_GTX0501.mrna-transcripts.fa -o analysis_and_temp_files/06_annotate_lecanoro/GTX0501_KS48XB1_filtered_ani.txt
  • This time, ANI = 99.28, almost the same
  • Compared two transcriptomes to each other: 99.05% identical
fastANI -r analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/KS48XB1_transcripts_headers_mapped_to_GTX501.fasta -q analysis_and_temp_files/06_annotate_lecanoro/trinity_assemblies/XBE1_transcripts_headers_mapped_to_GTX501.fasta -o analysis_and_temp_files/06_annotate_lecanoro/XBE1_KS48XB1_filtered_ani.txt

5. Next steps: